Background

This module builds on code contained in Coronavirus_Statistics_USAF_v004.Rmd. This file includes the latest code for analyzing data from USA Facts. USA Facts maintains data on cases and deaths by county for coronavirus in the US. Downloaded data are unique by county with date as a column and a separate file for each of cases, deaths, and population.

The intent of this code is to source updated functions that allow for readRunUSAFacts() to be run to obtain, read, process, and analyze data from USA Facts.

Sourcing Functions

The tidyverse library is loaded, and the functions used for CDC daily processing are sourced. Additionally, specific functions for USA Facts are also sourced:

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Functions are available in source file
source("./Generic_Added_Utility_Functions_202105_v001.R")
source("./Coronavirus_CDC_Daily_Functions_v001.R")
source("./Coronavirus_USAF_Functions_v001.R")

Further, the mapping file specific to USA Facts is sourced:

source("./Coronavirus_USAF_Default_Mappings_v001.R")

A few functions should be added back to Generic_Added_…v001.R and Coronavirus_CDC_Daily…_v001.R after they have been more thoroughly checked for compatibility with the state-based clustering. For now, they are included below so as to over-write the function obtained from source(…):

# Updated function for handling county-level clusters
createSummary <- function(df, 
                          stateClusterDF=NULL,
                          brewPalette=NA, 
                          dataType="state"
                          ) {
    
    # FUNCTION ARGUMENTS:
    # df: an integrated data frame by cluster-date
    # stateClusterDF: a data frame containing state-cluster (NULL means it can be found in df)
    # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
    # dataType: the type of maps being produced ("state" or "county")
    
    # Create plots that can be relevant for a dashboard, including:
    # 1. Map of segments
    # 2. Bar plot of counts by segment
    # 3. Facetted bar plot of segment descriptors (e.g., population, burden per million)
    # 4. Facetted trend-line plot of burden by segments
    
    # Create a map of the clusters
    p1 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF, 
                           mapLevel=if(dataType=="state") "states" else "counties",
                           keyCol=if(dataType=="state") "state" else "countyFIPS",
                           discreteValues=TRUE, 
                           labelScale=is.na(brewPalette), 
                           textLabel=if(dataType=="state") c("RI", "CT", "DE", "MD", "DC") else c(),
                           extraArgs=if(is.na(brewPalette)) list() else 
                               list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                           )
    
    # Create a bar plot of counts by segment
    p2 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF, 
                           mapLevel=if(dataType=="state") "states" else "counties",
                           keyCol=if(dataType=="state") "state" else "countyFIPS",
                           discreteValues=TRUE, 
                           labelScale=is.na(brewPalette), 
                           countOnly=TRUE,
                           extraArgs=if(is.na(brewPalette)) list() else 
                               list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                           )
    
    # Create plot for population and burden by cluster
    p3 <- df %>%
        helperAggTotal(aggVars=c("pop", "wm_tcpm7", "wm_tdpm7"), 
                       mapper=c("pop"="Population (millions)", 
                                "wm_tcpm7"="Cases per thousand", 
                                "wm_tdpm7"="Deaths per million"
                                ), 
                       xLab=NULL, 
                       yLab=NULL, 
                       title=NULL,
                       divideBy=c("pop"=1000000, "wm_tcpm7"=1000), 
                       extraArgs=if(is.na(brewPalette)) list() else 
                           list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                       )
    
    # Create plot for cumulative burden per million over time
    p4xtra <- list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"), 
                   arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
                   )
    if(!is.na(brewPalette)) p4xtra$arg3 <- scale_color_brewer("Cluster", palette=brewPalette)
    p4 <- df %>%
        helperAggTrend(aggVars=append(c("wm_tcpm7", "wm_tdpm7"), if(dataType=="state") "wm_hpm7" else NULL), 
                       mapper=c("wm_tcpm7"="Cases per thousand\n(cumulative)", 
                                "wm_tdpm7"="Deaths per million\n(cumulative)", 
                                "wm_hpm7"="Hospitalized per million\n(current)"
                                ),
                       yLab=NULL,
                       title=NULL, 
                       divideBy=c("wm_tcpm7"=1000), 
                       linesize=0.75,
                       extraArgs=p4xtra
                       )
    
    list(p1=p1, p2=p2, p3=p3, p4=p4)
    
}



# Helper function to make a summary map
helperSummaryMap <- function(df, 
                             mapLevel="states", 
                             keyCol="state",
                             values="cluster",
                             discreteValues=NULL,
                             legend.position="right",
                             labelScale=TRUE,
                             extraArgs=list(),
                             countOnly=FALSE,
                             textLabel=c(),
                             ...
                             ) {
    
    # FUNCTION ARGUMENTS:
    # df: a data frame containing a level of geography and an associated cluster
    # mapLevel: a parameter for whether the map is "states" or "counties"
    # keyCol: the key column for plotting (usmap::plot_usmap is particular, and this must be 'state' or 'fips')
    # values: the character name of the field containing the data to be plotted
    # discreteValues: boolean for whether the values are discrete (if not, use continuous)
    #                 NULL means infer from data
    # legend.position: character for the location of the legend in the plot
    # labelScale: boolean, should an scale_fill_ be created?  Use FALSE if contained in extraArgs
    # extraArgs: list of other arguments that will be appended as '+' to the end of the usmap::plot_usmap call
    # countOnly: should a bar plot of counts only be produced?
    # textLabel: a list of elements that should be labelled as text on the plot (too small to see)
    # ...: other parameters to be passed to usmap::plot_usmap (e.g., labels, include, exclude, etc.)
    
    # Modify the data frame to contain only the relevant data
    df <- df %>%
        select(all_of(c(keyCol, values))) %>%
        distinct()
    
    # Determine the type of data being plotted
    if (is.null(discreteValues)) discreteValues <- !is.numeric(df[[values]])
    
    # Convert data type if needed
    if (isTRUE(discreteValues) & is.numeric(df[[values]])) 
        df[[values]] <- factor(df[[values]])
    
    # If count only is needed, create a count map; otherwise create a map
    if (isTRUE(countOnly)) { 
        gg <- df %>%
            ggplot(aes(x=fct_rev(get(values)))) + 
            geom_bar(aes_string(fill=values)) + 
            stat_count(aes(label=..count.., y=..count../2), 
                       geom="text", 
                       position="identity", 
                       fontface="bold"
                       ) +
            coord_flip() + 
            labs(y="Number of members", x="")
    } else {
        if(keyCol=="countyFIPS") {
            df <- df %>% colRenamer(vecRename=c("countyFIPS"="fips"))
            keyCol <- "fips"
        }
        gg <- usmap::plot_usmap(regions=mapLevel, data=df, values=values, ...)
        if (length(textLabel) > 0) {
            labDF <- df %>% 
                filter(get(keyCol) %in% textLabel) %>%
                mutate(rk=match(get(keyCol), textLabel)) %>%
                arrange(rk) %>%
                mutate(lon=-70.1-seq(0, 0.8*length(textLabel)-0.8, by=0.8), 
                       lat=40.1-seq(0, 1.5*length(textLabel)-1.5, by=1.5)
                       ) %>%
                select(lon, lat, everything()) %>%
                usmap::usmap_transform()
            gg <- gg + geom_text(data=labDF, 
                                 aes(x=lon.1, y=lat.1, label=paste(get(keyCol), get(values))), 
                                 size=3.25
                                 )
        }
    }
    
    # Position the legend as requested
    gg <- gg + theme(legend.position=legend.position)
    
    # Create the scale if appropriate
    if (isTRUE(labelScale)) gg <- gg + 
        if(isTRUE(discreteValues)) scale_fill_discrete(values) else scale_fill_continuous(values)
    
    # Apply extra arguments
    for (ctr in seq_along(extraArgs)) gg <- gg + extraArgs[[ctr]]
    
    # Return the map object
    gg
    
}



# Function to pivot the data file longer
pivotData <- function(df, 
                      pivotKeys, 
                      nameVar="name", 
                      valVar="value",
                      toLonger=TRUE, 
                      ...
                      ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame
    # pivotKeys: the keys (everything but cols for pivot_longer, id_cols for pivot_wider)
    # nameVar: variable name for names_to or names_from
    # valVar: variable name for values_to or values_from
    # toLonger: boolean, should pivot_longer() be used rather than pivot_wider()?
    # ...: other arguments to be passed to pivot_*()

    if (isTRUE(toLonger)) pivot_longer(df, -all_of(pivotKeys), names_to=nameVar, values_to=valVar, ...)
    else pivot_wider(df, all_of(pivotKeys), names_from=all_of(nameVar), values_from=all_of(valVar), ...)
    
}

An existing processed USA Facts list is loaded for use as the comparison set:

cty_newformat_20201026 <- readFromRDS("cty_newformat_20201026")

A function is added to create a county-level cluster map with state borders:

sparseCountyClusterMap <- function(vec, 
                                   clustRemap=c("999"=NA), 
                                   caption=NULL, 
                                   brewPalette=NULL, 
                                   naFill="white"
                                   ) {
    
    # FUNCTION ARGUMENTS:
    # vec: a named vector where the names are countyFIPS and the values are the clusters
    #      can also be a data.frame, in which case clustersToFrame() and clustRemap are not used
    # clustRemap: remapping vector for clusters
    # caption: caption to be included (NULL means no caption)
    # brewPalette: name of a palette for scale_fill_brewer()
    #              NULL means use scale_fill_discrete() instead
    # naFill: fill to use for NA counties
    
    # Convert to a tibble for use with usmap if not already a data.frame
    if ("data.frame" %in% class(vec)) {
        df <- vec
    } else {
        df <- clustersToFrame(vec, colNameName="fips", convFactor=FALSE) %>%
            mutate(cluster=as.character(cluster), 
                   cluster=ifelse(cluster %in% names(clustRemap), clustRemap[cluster], cluster)
                   )
    }
    
    # Create a blank state map with black lines
    blankStates <- usmap::plot_usmap("states")
    
    # Create a county cluster map with NA values excluded
    dataCounties <- df %>% 
        filter(!is.na(cluster)) %>% 
        usmap::plot_usmap(regions="counties", data=., values="cluster")
    
    # Integrate as a ggplot object
    p1 <- ggplot() + 
        geom_polygon(data=dataCounties[[1]], 
                     aes(x=x, y=y, group=group, fill=dataCounties[[1]]$cluster), 
                     color = NA,
                     size = 0.1
                     ) +  
        geom_polygon(data=blankStates[[1]], 
                     aes(x=x, y=y, group=group), 
                     color = "black", 
                     lwd=1.25,
                     fill = alpha(0.001)
                     ) + 
        coord_equal() + 
        ggthemes::theme_map()
    
    # Add the appropriate fill (can use viridis_d also)
    if (is.null(brewPalette)) p1 <- p1 + scale_fill_discrete("Cluster", na.value=naFill)
    else if (brewPalette=="viridis") p1 <- p1 + scale_fill_viridis_d("Cluster", na.value=naFill)
    else p1 <- p1 + scale_fill_brewer("Cluster", palette=brewPalette, na.value=naFill)
    
    # Add caption if requested
    if (!is.null(caption)) p1 <- p1 + labs(caption=caption)
    
    # Return the plotting object
    p1
    
}

The function is then run using previously created segments:

sparseCountyClusterMap(cty_newformat_20201026$useClusters, 
                       brewPalette="Paired", 
                       caption="Counties with population under 25k are blank"
                       )

Next steps are to load new data and compare against previous, while using existing segments:

readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210608.csv", 
                 "usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210608.csv"
                 )
compareList <- list("usafCase"=cty_newformat_20201026$dfRaw$usafCase, 
                    "usafDeath"=cty_newformat_20201026$dfRaw$usafDeath
                    )

# Create new clusters
cty_newdata_20210608 <- readRunUSAFacts(maxDate="2021-06-06", 
                                        downloadTo=lapply(readList, 
                                                          FUN=function(x) if(file.exists(x)) NA else x
                                                          ),
                                        readFrom=readList, 
                                        compareFile=compareList, 
                                        writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log", 
                                        ovrwriteLog=TRUE,
                                        useClusters=cty_newformat_20201026$useClusters,
                                        skipAssessmentPlots=FALSE,
                                        brewPalette="Paired"
                                        )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
## 
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS 
## 
## 
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 225
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
## 
## Checking for similarity of: county
## In reference but not in current: 00001 02270 06000
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
## 14 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
## 551 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.

## 
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS 
## 
## 
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 225
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
## 
## Checking for similarity of: county
## In reference but not in current: 00001 02270 06000
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
## 29 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log

## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
## 640 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
## 
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
##   isType   cases new_cases            n
##   <chr>    <dbl>     <dbl>        <dbl>
## 1 before 6.20e+9   3.28e+7 1602886     
## 2 after  6.18e+9   3.27e+7 1577284     
## 3 pctchg 4.58e-3   3.84e-3       0.0160
## 
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
##   isType  deaths   new_deaths            n
##   <chr>    <dbl>        <dbl>        <dbl>
## 1 before 1.26e+8 590404       1602886     
## 2 after  1.25e+8 589015       1577284     
## 3 pctchg 3.75e-3      0.00235       0.0160

## NULL

sparseCountyClusterMap(cty_newdata_20210608$useClusters, 
                       brewPalette="Paired", 
                       caption="Counties with population under 25k are blank"
                       )

There has been significant convergence among segments in average deaths per million and cases per million. This is suggestive of several possibilities, such as that growth in burden may be inversely proportional to previous burden. Next steps are to create segments using the most recent data, seeking to identify differences in 1) cumulative burden, primarily defined by deaths, and 2) shape of the curve in getting to cumulative burden.

New segments are created, with assessments:

readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210608.csv", 
                 "usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210608.csv"
                 )

# Create new clusters
cty_newsegs_20210608 <- readRunUSAFacts(maxDate="2021-06-06", 
                                        writeLog="./RInputFiles/Coronavirus/USAF_NewSegs_20210608_v005.log", 
                                        ovrwriteLog=TRUE,
                                        dfPerCapita=cty_newdata_20210608$dfPerCapita,
                                        useClusters=NULL,
                                        skipAssessmentPlots=FALSE,
                                        brewPalette="Paired", 
                                        defaultCluster="999",
                                        minPopCluster=40000, 
                                        hierarchical=NA,
                                        minShape="2020-04",
                                        maxShape="2021-05",
                                        ratioDeathvsCase = 5,
                                        ratioTotalvsShape = 0.5,
                                        minDeath=100,
                                        minCase=5000, 
                                        hmlSegs=3, 
                                        eslSegs=3, 
                                        seed=2106091243
                                        )
## 
## *** File has been checked for uniqueness by: countyFIPS date

## NULL

sparseCountyClusterMap(cty_newsegs_20210608$useClusters, 
                       brewPalette="Paired", 
                       caption="Counties with population under 40k are blank"
                       )

The function createSummary() is updated to 1) create the sparse summary map; and 2) exclude the small county segment from select plots:

# Function to create diagnoses and plots for clustering data
diagnoseClusters <- function(lst, 
                             lstExtract=fullListExtract,
                             clusterFrame=NULL,
                             brewPalette=NA, 
                             clusterType="state",
                             printSummary=TRUE, 
                             printDetailed=TRUE
                             ) {
    
    # FUNCTION ARGUMENTS:
    # lst: a list containing processed clustering data
    # lstExtract: the elements to extract from lst with an optional function for converting the elements
    #             NULL means use the extracted element as-is
    # clusterFrame: tibble of the clusters to be plotted
    #               NULL means create from lst
    # brewPalette: the color palette to use with scale_*_brewer()
    #              default NA means use the standard color/fill profile
    # clusterType: character variable of form "state" for state clusters and "county" for county
    # printSummary: boolean, should summary plots be printed to the log?
    # printDetailed: boolean, should detailed plots be printed to the log?
    
    # Get the key variable (used for joins and the like)
    if (clusterType=="state") keyVar <- "state"
    else if (clusterType=="county") keyVar <- "countyFIPS"
    else stop(paste0("\nThe passed clusterType: ", clusterType, " is not programmed\n"))
    
    # Create clusterFrame from lst if it has been passed as NULL
    if (is.null(clusterFrame)) clusterFrame <- clustersToFrame(lst, colNameName=keyVar)
    
    # Create the integrated and aggregate data from lst
    dfFull <- integrateData(lst, lstExtract=lstExtract, otherDF=list(clusterFrame), keyJoin=keyVar)
    dfAgg <- combineAggData(dfFull, aggBy=plotCombineAggByMapper[[clusterType]])
    
    # Create the main summary plots
    summaryPlots <- createSummary(dfAgg, 
                                  stateClusterDF=clusterFrame, 
                                  brewPalette=brewPalette, 
                                  dataType=clusterType
    )
    
    # Create the detailed summaries
    detPlots <- createDetailedSummaries(dfDetail=dfFull, 
                                        dfAgg=dfAgg, 
                                        detVar=keyVar,
                                        p2DetMetrics=plotCombineAggByMapper[[clusterType]]$agg2$aggVars,
                                        brewPalette=brewPalette
    )
    
    # Print the summary plots if requested (use the sparse county plotting)
    if (isTRUE(printSummary)) {
        gridExtra::grid.arrange(summaryPlots$p5 + theme(legend.position="none"), 
                                summaryPlots$p3 + theme(legend.position="left"), 
                                summaryPlots$p4, 
                                layout_matrix=rbind(c(1, 2), 
                                                    c(3, 3)
                                )
        )
    }
    
    # Print the detailed plots if requested
    if (isTRUE(printDetailed)) purrr::walk(detPlots, .f=print)
    
    # Return a list of the key plotting files
    list(dfFull=dfFull, 
         dfAgg=dfAgg, 
         plotClusters=clusterFrame, 
         summaryPlots=summaryPlots, 
         detPlots=detPlots
    )
    
}



# Function to create detailed cluster summaries
createDetailedSummaries <- function(dfDetail, 
                                    dfAgg, 
                                    aggVar=c("cluster"), 
                                    detVar=c("state"),
                                    p1Metrics=c("tcpm", "tdpm"), 
                                    p1Order=c("tdpm"), 
                                    p2DetMetrics=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7"),
                                    p2AggMetrics=paste0("wm_", p2DetMetrics),
                                    p3Metrics=p1Metrics,
                                    p3Days=30,
                                    p3Slope=0.25,
                                    mapper=c("tcpm"="Cases per million\n(cumulative)", 
                                             "tdpm"="Deaths per million\n(cumulative)", 
                                             "cpm7"="Cases\nper million", 
                                             "dpm7"="Deaths\nper million",
                                             "hpm7"="Hospitalized\nper million",
                                             "tdpm7"="Deaths (cum)\nper million",
                                             "tcpm7"="Cases (cum)\nper million"
                                    ),
                                    brewPalette=NA
                                    ) {
    
    # FUNCTION ARGUMENTS:
    # dfDetail: data frame or tibble containing detailed (sub-cluster) data
    # dfAgg: data frame or tibble containing aggregated (cluster) data
    # aggVar: variable reflecting the aggregate level
    # detVar: variable reflecting the detailed level
    # p1Metrics: metrics to be shown for plot 1 (will be faceted)
    # p1Order: variable for ordering detVar in p1
    # p2DetMetrics: variables to be included from dfDetail for p2
    # p2AggMetrics: corresponding variables to be included from dfAgg for p2
    # p3Metrics: metrics to be included in the growth plots
    # p3Days: number of days to include for calculating growth
    # p3Slope: the slope for the dashed line for growth in p3
    # mapper mapping file for variable name to descriptive name
    # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)    
    
    # Create plot for aggregates by sub-cluster
    if(detVar=="state") {
        p1 <- dfDetail %>%
            colSelector(vecSelect=c(detVar, aggVar, "date", p1Metrics)) %>%
            pivot_longer(all_of(p1Metrics)) %>%
            filter(!is.na(value)) %>%
            group_by_at(detVar) %>%
            filter(date==max(date)) %>%
            ungroup() %>%
            ggplot(aes(x=fct_reorder(get(detVar), 
                                     value, 
                                     .fun=function(x) { max(ifelse(name==p1Order, x, 0)) }
                                     ),
                       y=value
                       )
                   ) + 
            geom_col(aes(fill=get(aggVar))) + 
            facet_wrap(~mapper[name], scales="free_x") + 
            coord_flip() + 
            labs(title="Cumulative burden", x=NULL, y=NULL)
        if (!is.na(brewPalette)) p1 <- p1 + scale_fill_brewer("Cluster", palette=brewPalette)
    } else {
        # Do not create the plot for other than state-level data
        p1 <- NULL
    }
    
    # Create facetted burden trends by aggregate
    # For state-level, create each state as a line
    # For anything else, create a 10%-90% range
    if (detVar=="state") {
        p2 <- dfDetail %>%
            colSelector(vecSelect=c(detVar, "date", aggVar, p2DetMetrics)) %>%
            pivot_longer(all_of(p2DetMetrics)) %>%
            filter(!is.na(value)) %>%
            ggplot(aes(x=date, y=value)) + 
            geom_line(aes_string(group=detVar), color="grey", size=0.5) + 
            facet_grid(mapper[name] ~ get(aggVar), scales="free_y") + 
            scale_x_date(date_breaks="2 months", date_labels="%b-%y") + 
            theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
            labs(x=NULL, y=NULL, title="Burden by cluster and metric")
    } else {
        p2 <- dfDetail %>%
            colSelector(vecSelect=c(detVar, "date", aggVar, p2DetMetrics)) %>%
            pivot_longer(all_of(p2DetMetrics)) %>%
            filter(!is.na(value)) %>%
            group_by_at(c(aggVar, "name", "date")) %>%
            summarize(p10=unname(quantile(value, 0.1)), p90=unname(quantile(value, 0.9)), .groups="drop") %>%
            ggplot(aes(x=date)) + 
            geom_ribbon(aes(ymin=p10, ymax=p90), alpha=0.75) +
            facet_grid(mapper[name] ~ get(aggVar), scales="free_y") + 
            scale_x_date(date_breaks="2 months", date_labels="%b-%y") + 
            theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
            labs(x=NULL, 
                 y=NULL, 
                 title="Burden by cluster and metric", 
                 subtitle="Shaded region is 10%le through 90%le, solid line is weighted mean"
                 )
    }
    aggPlot <- dfAgg %>% 
        colSelector(vecSelect=c("date", aggVar, p2AggMetrics)) %>%
        colRenamer(vecRename=purrr::set_names(p2DetMetrics, p2AggMetrics)) %>%
        pivot_longer(all_of(p2DetMetrics)) %>%
        filter(!is.na(value))
    p2 <- p2 + 
        geom_line(data=aggPlot, aes_string(color=aggVar, group=aggVar, y="value"), size=1.5)
    if (!is.na(brewPalette)) p2 <- p2 + 
        scale_color_brewer("Cluster", palette=brewPalette) + 
        theme(legend.position="none")
    
    # Create growth trends plot
    if (TRUE) {
        if(detVar=="countyFIPS") p3 <- dfDetail %>% filter(cluster != "999") else p3 <- dfDetail
        p3 <- p3 %>%
            colSelector(vecSelect=c(detVar, aggVar, "date", p3Metrics)) %>%
            pivot_longer(all_of(p3Metrics)) %>%
            filter(!is.na(value)) %>%
            group_by_at(c(detVar, "name")) %>%
            filter(date %in% c(max(date), max(date)-lubridate::days(p3Days))) %>%
            mutate(growth=max(value)-min(value)) %>%  # not ideal way to calculate
            filter(date==max(date)) %>%
            ungroup() %>%
            ggplot(aes(x=value, y=growth))
        if(detVar=="state") p3 <- p3 + geom_text(aes_string(color=aggVar, label=detVar), fontface="bold")
        else p3 <- p3 + geom_point(aes_string(color=aggVar))
        p3 <- p3 + 
            facet_wrap(~mapper[name], scales="free") + 
            labs(title=paste0("Current vs growth"), 
                 subtitle=paste0("Dashed line represents ", 
                                 round(100*p3Slope), 
                                 "% growth rate over past ", 
                                 p3Days, 
                                 " days"
                                 ),
                 x="Most recent cumulative", 
                 y=paste0("Growth over past ", p3Days, " days")
                 ) + 
            lims(y=c(0, NA), x=c(0, NA)) + 
            theme(panel.background = element_rect(fill = "white", colour = "white"), 
                  panel.grid.major = element_line(size = 0.25, linetype = 'solid', color = "grey")
                  ) + 
            geom_abline(slope=p3Slope, intercept=0, lty=2)
        if (!is.na(brewPalette)) { 
            p3 <- p3 + scale_color_brewer(stringr::str_to_title(aggVar), palette=brewPalette)
        }
        if(detVar=="countyFIPS") p3 <- p3 + labs(caption="Counties below population threshold excluded")
    } else {
        p3 <- NULL
    }
    
    # Return a list of plot objects
    list(p1=p1, p2=p2, p3=p3)
    
}



# Updated function for handling county-level clusters
createSummary <- function(df, 
                          stateClusterDF=NULL,
                          brewPalette=NA, 
                          dataType="state"
                          ) {
    
    # FUNCTION ARGUMENTS:
    # df: an integrated data frame by cluster-date
    # stateClusterDF: a data frame containing state-cluster (NULL means it can be found in df)
    # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
    # dataType: the type of maps being produced ("state" or "county")
    
    # Create plots that can be relevant for a dashboard, including:
    # 1. Map of segments
    # 2. Bar plot of counts by segment
    # 3. Facetted bar plot of segment descriptors (e.g., population, burden per million)
    # 4. Facetted trend-line plot of burden by segments
    # 5. Sparse county cluster map (if dataType is "county")
    
    # Create a map of the clusters
    p1 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF, 
                           mapLevel=if(dataType=="state") "states" else "counties",
                           keyCol=if(dataType=="state") "state" else "countyFIPS",
                           discreteValues=TRUE, 
                           labelScale=is.na(brewPalette), 
                           textLabel=if(dataType=="state") c("RI", "CT", "DE", "MD", "DC") else c(),
                           extraArgs=if(is.na(brewPalette)) list() else 
                               list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                           )
    
    # Create a bar plot of counts by segment
    p2 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF, 
                           mapLevel=if(dataType=="state") "states" else "counties",
                           keyCol=if(dataType=="state") "state" else "countyFIPS",
                           discreteValues=TRUE, 
                           labelScale=is.na(brewPalette), 
                           countOnly=TRUE,
                           extraArgs=if(is.na(brewPalette)) list() else 
                               list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                           )
    
    # Create plot for population and burden by cluster
    p3 <- df %>%
        helperAggTotal(aggVars=c("pop", "wm_tcpm7", "wm_tdpm7"), 
                       mapper=c("pop"="Population (millions)", 
                                "wm_tcpm7"="Cases per thousand", 
                                "wm_tdpm7"="Deaths per million"
                                ), 
                       xLab=NULL, 
                       yLab=NULL, 
                       title=NULL,
                       divideBy=c("pop"=1000000, "wm_tcpm7"=1000), 
                       extraArgs=if(is.na(brewPalette)) list() else 
                           list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                       )
    
    # Create plot for cumulative burden per million over time
    p4xtra <- list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"), 
                   arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
                   )
    if(!is.na(brewPalette)) p4xtra$arg3 <- scale_color_brewer("Cluster", palette=brewPalette)
    p4 <- df %>%
        helperAggTrend(aggVars=append(c("wm_tcpm7", "wm_tdpm7"), if(dataType=="state") "wm_hpm7" else NULL), 
                       mapper=c("wm_tcpm7"="Cases per thousand\n(cumulative)", 
                                "wm_tdpm7"="Deaths per million\n(cumulative)", 
                                "wm_hpm7"="Hospitalized per million\n(current)"
                                ),
                       yLab=NULL,
                       title=NULL, 
                       divideBy=c("wm_tcpm7"=1000), 
                       linesize=0.75,
                       extraArgs=p4xtra
                       )
    
    # Create the sparse county plot (if it is county data) - assumes default that '999' is 'too small'
    if (dataType=="county") {
        vecFrame <- if(is.null(stateClusterDF)) df else stateClusterDF
        vecFrame <- vecFrame %>% select(countyFIPS, cluster) %>%
            distinct()
        vecUse <- vecFrame$cluster %>% purrr::set_names(vecFrame$countyFIPS)
        p5 <- sparseCountyClusterMap(vecUse, 
                                     caption="Counties below population threshold left blank",
                                     brewPalette=if(is.na(brewPalette)) NULL else brewPalette
                                     )
    } else {
        p5 <- NULL
    }
    
    list(p1=p1, p2=p2, p3=p3, p4=p4, p5=p5)
    
}

The updated functions are tested:

# Confirm that new cluster maps are working as intended
cty_chksegs_20210608 <- readRunUSAFacts(maxDate="2021-06-06", 
                                        dfPerCapita=cty_newsegs_20210608$dfPerCapita,
                                        useClusters=cty_newsegs_20210608$useClusters,
                                        skipAssessmentPlots=FALSE, 
                                        brewPalette="Paired"
                                        )

## NULL

The updated maps blank out the ‘999’ (small population) cluster as desired. The clusters are designed using a 3x3 of total burden (mainly death) and shape of the curve. Shapes of the curve are plotted, normalized so that each curve sums to 100%:

p1Data <- cty_chksegs_20210608$plotDataList$dfAgg %>% 
    pivotData(pivotKeys=c("cluster", "date", "pop")) %>% 
    filter(!is.na(value), cluster!="999", name %in% c("wm_cpm7", "wm_dpm7")) %>% 
    group_by(cluster, name) %>% 
    mutate(pct=value/sum(value)) %>% 
    ungroup()

# Plot all together
p1Data %>% 
    ggplot(aes(x=date, y=pct)) + 
    geom_line(aes(group=cluster, color=cluster), lwd=1) + 
    facet_wrap(~c("wm_cpm7"="Rolling 7 cases", "wm_dpm7"="Rolling 7 deaths")[name]) + 
    scale_color_brewer(palette="Paired") + 
    labs(x=NULL, 
         y="Percentage of total burden", 
         main="Burden over time, normalized to 100% cumulative burden"
         )

# Plot as facets
p1Data %>% 
    ggplot(aes(x=date, y=pct)) + 
    geom_line(aes(group=cluster, color=cluster), lwd=1) + 
    facet_grid(c("wm_cpm7"="Rolling 7 cases", "wm_dpm7"="Rolling 7 deaths")[name]~cluster) + 
    scale_color_brewer(palette="Paired") + 
    labs(x=NULL, 
         y="Percentage of total burden", 
         main="Burden over time, normalized to 100% cumulative burden"
         )

Broadly, the shapes selected by the clustering include:

  1. Two spikes, primary impact early
  2. Two spikes, balanced early and late impact
  3. One main spike, late

Broadly, there are examples of relatively higher and lower burden within each of these shapes, though the highest burden clusters mainly had two spikes with primary impact early and the medium burden clusters rarely had the primary impact early.

Next, all counties of at least 50,000 population are examined for the shape of the cumulative deaths curve:

p2Data <- cty_chksegs_20210608$plotDataList$dfFull %>% 
    select(countyFIPS, pop, state, date, tdpm7, tcpm7) %>%
    pivotData(pivotKeys=c("countyFIPS", "state", "date", "pop")) %>% 
    filter(!is.na(value), pop>=50000) %>% 
    group_by(countyFIPS, name) %>% 
    mutate(pct=value/max(value)) %>% 
    ungroup()

p2Dist <- p2Data %>% 
    filter(name=="tdpm7") %>% 
    select(countyFIPS, date, pct) %>% 
    pivotData(pivotKeys="countyFIPS", toLonger = FALSE, nameVar="date", valVar="pct") %>%
    column_to_rownames("countyFIPS") %>%
    as.matrix() %>%
    dist()

p2Complete <- hclust(p2Dist, method="complete")
plot(p2Complete)

p2Single <- hclust(p2Dist, method="single")
plot(p2Single)

There is something peculiar about data in ‘32510’. A plot is created:

p2Data %>%
    ggplot(aes(x=date, y=pct)) +
    geom_line(data=~filter(., countyFIPS=="32510"), aes(group=countyFIPS), color="red", lwd=2) + 
    geom_line(data=~summarize(group_by(., name, date), pct=mean(pct), .groups="drop")) +
    facet_wrap(~c("tcpm7"="Rolling 7 cases", "tdpm7"="Rolling 7 deaths")[name]) + 
    labs(x=NULL, y="Percentage of total burden on day", title="Cluster 32510 is red, mean is black")

There is clearly a data issue with cluster 32510. Clusters are examined at a variable number of cut points:

plotHierarchical <- function(obj, k, df) {
    
    # FUNCtION ARGuMENTS:
    # obj: clustering object from hierarchical clustering
    # k: number of clusters to create
    # df: data frame with burden data
    
    # Counts by cluster
    ctCluster <- cutree(obj, k=k) %>%
        table() %>%
        print()
    
    # Make clustering data frame
    clData <- cutree(obj, k=k) %>%
        clustersToFrame(colNameName="countyFIPS") %>%
        joinFrames(df) %>%
        group_by(countyFIPS, name) %>%
        mutate(delta=ifelse(row_number()==1, pct, pct-lag(pct))) %>%
        ungroup() %>%
        group_by(cluster, date, name) %>%
        summarize(p05cum=quantile(pct, 0.05), 
                  mucum=mean(pct), 
                  p95cum=quantile(pct, 0.95), 
                  p05delta=quantile(delta, 0.05), 
                  mudelta=mean(delta), 
                  p95delta=quantile(delta, 0.95), 
                  .groups="drop"
                  )
    
    # Plot by cluster and metric
    p1 <- clData %>%
        ggplot(aes(x=date)) + 
        geom_line(aes(color=cluster, y=mucum), lwd=1) + 
        geom_ribbon(aes(ymin=p05cum, ymax=p95cum), alpha=0.25) +
        facet_grid(c("tcpm7"="% cumulative cases", "tdpm7"="% cumulative deaths")[name]~cluster) + 
        scale_color_brewer(palette="Set1") + 
        labs(x=NULL, y="% of cumulative", title="Mean and 90% range by cluster and metric")
    print(p1)
    
    # Plot incremental rather than cumulative
    p2 <- clData %>%
        ggplot(aes(x=date)) + 
        geom_line(aes(color=cluster, y=mudelta), lwd=1) + 
        geom_ribbon(aes(ymin=p05delta, ymax=p95delta), alpha=0.25) +
        facet_grid(c("tcpm7"="% total cases", "tdpm7"="% total deaths")[name]~cluster) + 
        scale_color_brewer(palette="Set1") + 
        labs(x=NULL, y="% of total", title="Mean and 90% range by cluster and metric")
    print(p2)
    
    # Return the clustering frame
    clData
    
}

plotList <- lapply(2:5, FUN=function(x) plotHierarchical(p2Complete, k=x, df=p2Data))
## .
##   1   2 
## 841 149
## Joining, by = "countyFIPS"

## .
##   1   2   3 
## 773  68 149
## Joining, by = "countyFIPS"

## .
##   1   2   3   4 
## 498 275  68 149
## Joining, by = "countyFIPS"

## .
##   1   2   3   4   5 
## 498 275  68 114  35
## Joining, by = "countyFIPS"

At a glance, with 5 clusters, there is a clear early cluster, a clear early/late cluster, and a group of three seemingly similar clusters (primarily late). Further exploration of these three clusters may be merited.

The five clusters are examined on a single plot:

tempPlotter <- function(df, vecLate, facetScales="free_y", showRibbon=TRUE, showCum=FALSE) {

    p1 <- df %>% 
        mutate(lateCluster=(cluster %in% vecLate)) %>%
        ggplot(aes(x=date)) + 
        geom_line(aes(group=cluster, color=cluster, y=if(showCum) mucum else mudelta), lwd=1) + 
        facet_grid(lateCluster~c("tcpm7"="% total cases", "tdpm7"="% total deaths")[name], 
                   scales=facetScales
                   ) + 
        scale_color_brewer(palette="Set1") + 
        labs(x=NULL, 
             y=NULL, 
             title="Disease burden by time period and shape cluster"
             )
    
    if (isTRUE(showRibbon)) {
        p1 <- p1 + 
            geom_ribbon(aes(ymin=if(showCum) p05cum else p05delta, 
                            ymax=if(showCum) p95cum else p95delta, 
                            fill=cluster, 
                            group=cluster
                            ), alpha=0.25) + 
            scale_fill_brewer(palette="Set1") + 
            labs(subtitle="Shaded region covers 90% of observations")
    }

    p1
    
}

tempPlotter(plotList[[4]], vecLate=c(4, 5), showRibbon=FALSE)

tempPlotter(plotList[[4]], vecLate=c(4, 5), showRibbon=FALSE, showCum=TRUE)

tempPlotter(plotList[[4]], vecLate=c(4, 5))

tempPlotter(plotList[[4]], vecLate=c(4, 5), showCum=TRUE)

Visually, there are meaningful distinctions in the clusters when overlaid, particularly with the shape of the cumulative curve:

The geographic locations of the five hierarchical cluster types are plotted:

tempMapper <- function(obj, 
                       k, 
                       regions="counties", 
                       varName=if(regions=="counties") "fips" else "state", 
                       values="cluster", 
                       lvlOrder=NULL, 
                       title=if(is.null(lvlOrder)) NULL else "Lightest colors have earliest burden",
                       caption=NULL
                       ) {
    
    df <- cutree(obj, k=k) %>%
        clustersToFrame(colNameName=varName) 
    
    if (!is.null(lvlOrder)) df[[values]] <- factor(df[[values]], levels=lvlOrder)
    
    df %>%
        sparseCountyClusterMap(brewPalette="viridis", caption=caption) + 
        labs(title=title)
    
}

tmpText <- "Plots only counties with at least 50k population"
tempMapper(p2Complete, k=2, lvlOrder=c(1, 2), caption=tmpText)

tempMapper(p2Complete, k=3, lvlOrder=c(2, 1, 3), caption=tmpText)

tempMapper(p2Complete, k=4, lvlOrder=c(3, 2, 1, 4), caption=tmpText)

tempMapper(p2Complete, k=5, lvlOrder=c(3, 2, 1, 4, 5), caption=tmpText)

Next steps are to create curves using larger counties, then assess how well they fit the shape of the curves in the smaller counties.